Load required Libraries
rm(list=ls())
library(ggplot2)
library(dplyr)
library(tidyr)
library(RMySQL)
library(stringr)
library(magrittr)
library(pcaPP)
library(directlabels)
library(proto)
Load in Wordbank tata
## OPEN DATABASE CONNECTION ##
wordbank <- src_mysql(dbname='wordbank',host="54.149.39.46",
user="wordbank",password="wordbank")
## NOW LOAD TABLES ##
source.table <- tbl(wordbank,"common_source")
admin.table <- tbl(wordbank,"common_administration")
child.table <- tbl(wordbank,"common_child")
wordmapping.table <- tbl(wordbank,"common_wordmapping")
instruments.table <- tbl(wordbank,"common_instrumentsmap")
english.ws.table <- tbl(wordbank,"instruments_english_ws")
spanish.ws.table <- tbl(wordbank,"instruments_spanish_ws")
norwegian.ws.table <- tbl(wordbank,"instruments_norwegian_ws")
danish.ws.table <- tbl(wordbank,"instruments_danish_ws")
Get kid data and put together.
# Get administration info
admins <- admin.table %>%
select(data_id,child_id,age,source_id) %>%
rename(id = data_id, child.id = child_id, source.id = source_id)
admins <- as.data.frame(admins)
# Get demographic variables for each child
demos <- select(child.table,id,sex,mom_ed,birth_order) %>%
rename(child.id = id) # Rename id fields
demos <- as.data.frame(demos)
# Join age and demographics together
child.data <- as.tbl(left_join(admins,demos))
Set up mappings and instruments.
mapping <- as.data.frame(wordmapping.table)
instruments <- as.data.frame(instruments.table) %>%
rename(instrument_id = id)
items <- left_join(mapping, instruments)
Fucntion for getting all of the data in wordbank for a given language (kid x item).
get.language.data <- function(lang.table, lang.items, lang, child.data) {
instrument.items <- lang.items %>%
filter(language == lang, form == 'WS') %>%
select(item, type, category, lexical_category) %>%
mutate(item = str_replace(item, "\\.", "_")) # Fix _/. inconsistencies
instrument.data <- as.data.frame(lang.table) %>%
rename(id = basetable_ptr_id) %>% # Rename the id
gather(item, value, -id) %>% # Arrange in longform
mutate(item = str_replace(item, "item_", "")) # Strip off item_
d <- left_join(instrument.data, instrument.items)
d <- left_join(d, child.data)
}
Get kid x item data for all languages.
d.english <- get.language.data(lang.table=english.ws.table,
lang.items=items,
lang="English",
child.data)
d.spanish <- get.language.data(lang.table=spanish.ws.table,
lang.items=items,
lang="Spanish",
child.data)
d.norwegian <- get.language.data(lang.table=norwegian.ws.table,
lang.items=items,
lang="Norwegian",
child.data)
# Norwegian data is loaded in funny -- NAs in wordform are actually 0s
d.norwegian[d.norwegian$type %in% c("word_form","word")
& is.na(d.norwegian$value),]$value = ""
d.danish <- get.language.data(lang.table=danish.ws.table,
lang.items=items,
lang="Danish",
child.data)
# Danish data is loaded in funny -- NAs in wordform are actually 0s
d.danish[d.danish$type %in% c("word_form","word")
& is.na(d.danish$value),]$value = ""
Function for getting vocab size data.
language.vocab.sizes <- function(lang.data) {
d.vocab <- lang.data %>%
filter(type == "word") %>%
group_by(age,id) %>%
summarise(vocab.sum = sum(value == "produces", na.rm=TRUE),
vocab.mean = vocab.sum/length(value))
return(d.vocab)
}
Function for getting kid x {vocab size, syntax score, morphology score} data.
According to Virginia, NAs here are just “my kid doesn’t say that,” and should be scored as 1s. So now this is computed as the total number of “complex” divided by the length.
summarise.language.data <- function(lang.data,lang) {
d.vocab <- language.vocab.sizes(lang.data)
d.complexity <- lang.data %>%
filter(type == "complexity") %>%
group_by(id) %>%
summarise(all.na = all(is.na(value)),
complexity.sum = sum(value == "complex",
na.rm=TRUE) / length(value)) %>%
mutate(complexity = ifelse(all.na,NA,complexity.sum)) %>%
select(-all.na,-complexity.sum) # Deals with ifelse
# forcing values to logical
d.wordform <- lang.data %>%
filter(type == "word_form") %>%
group_by(id) %>%
summarise(all.na = all(is.na(value)),
wordform.sum = sum(value == "produces",
na.rm=TRUE) / length(value)) %>%
mutate(wordform = ifelse(all.na,NA,wordform.sum)) %>%
select(-all.na,-wordform.sum) # Deals with ifelse
# forcing values to logical
# Spanish doesn't have ending data, so its skipped, at least for now.
# d.ending <- d %>%
# filter(type %in% c("ending")) %>%
# group_by(id) %>%
# summarise(ending_sometimes = mean(value == "sometimes" |
# value == "often",
# na.rm=TRUE),
# ending_often = mean(value == "often",
# na.rm=TRUE))
# d.composite <- left_join(d.composite, d.ending)
d.composite <- left_join(d.vocab, d.complexity)
d.composite <- left_join(d.composite, d.wordform)
# %>%
# filter(num.complexity.na == 0) %>%
# select(-num.complexity.na)
#
d.composite$language <- lang
return(d.composite)
}
Get kid x {vocab size, syntax score, morphology score} data for all languages and aggregate them.
summary.english <- summarise.language.data(d.english,"English")
summary.spanish <- summarise.language.data(d.spanish,"Spanish")
summary.norwegian <- summarise.language.data(d.norwegian,"Norwegian")
summary.danish <- summarise.language.data(d.danish,"Danish")
summary.data <- rbind_list(summary.english,summary.spanish,
summary.norwegian,summary.danish) %>%
filter(age > 15 & age < 33) %>%
mutate(age.group = cut(age, breaks = c(15, 20, 24, 28, 32)),
language = factor(language,
levels=c("English", "Spanish",
"Norwegian", "Danish")))
# gather for plotting
ms <- summary.data %>% gather(measure, score, complexity:wordform) %>%
mutate(measure = factor(measure, levels = c("wordform","complexity"),
labels = c("Word Form", "Complexity")))
ms %>%
group_by(language, age.group) %>%
summarise(n = n())
## Source: local data frame [16 x 3]
## Groups: language
##
## language age.group n
## 1 English (15,20] 4596
## 2 English (20,24] 2468
## 3 English (24,28] 2944
## 4 English (28,32] 1264
## 5 Spanish (15,20] 706
## 6 Spanish (20,24] 608
## 7 Spanish (24,28] 594
## 8 Spanish (28,32] 280
## 9 Norwegian (15,20] 3394
## 10 Norwegian (20,24] 6438
## 11 Norwegian (24,28] 6196
## 12 Norwegian (28,32] 5418
## 13 Danish (15,20] 1842
## 14 Danish (20,24] 1754
## 15 Danish (24,28] 1420
## 16 Danish (28,32] 1346
Using Age and Vocab to predict Morphology and Syntax Scores.
quartz(width=8,height=7.5)
ggplot(ms, aes(x = vocab.mean, y = score, colour = age.group, fill = age.group,
label = age.group)) +
#geom_point(alpha=.5, size=.8) +
geom_jitter(alpha=.5,size=.8) +
geom_smooth(method="lm", formula = y ~ I(x^2) - 1) +
facet_grid(language~measure) +
scale_x_continuous(limits = c(0,1), breaks = seq(0,1,.1),
name = "Vocabulary Size") +
scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),
"Score (Mean Items)") +
theme_bw(base_size = 14) +
scale_color_brewer(palette="Set1") +
scale_fill_brewer(palette="Set1")
Using Morphology scores to Predict Syntax scores.
quartz(width=8,height=7.5)
ggplot(summary.data,aes(x = wordform, y = complexity, fill=age.group,colour=age.group,
label=age.group)) +
facet_wrap( ~ language) +
geom_jitter(size=1)+
geom_smooth(method="lm", formula = y ~ exp(x) - 1) +
scale_x_continuous(limits = c(0,1.05), breaks=seq(0,1,.2),
name = "Morphology Score") +
scale_y_continuous(limits = c(0,1.05), breaks=seq(0,1,.2),"Syntax Score") +
scale_color_brewer(palette="Set1") +
scale_fill_brewer(palette="Set1") +
theme_bw(base_size = 14)
Function for computing vocabulary composition for each speaker of a language.
vocab.composition <- function(lang.data,lang) {
d.vocab <- language.vocab.sizes(lang.data)
d.cat <- lang.data %>%
filter(type == "word") %>%
group_by(id,lexical_category) %>%
summarise(cat = sum(value == "produces", na.rm=TRUE))
d.vocab.comp <- left_join(d.vocab, d.cat) %>%
mutate(prop = cat / vocab.sum) %>%
select(-cat)
d.vocab.comp$language = lang
return(d.vocab.comp)
}
Function for computing CDI form composition for all languages.
lang.vocab.composition <- function(lang.items) {
lang.words <- lang.items %>%
filter(form == "WS",type=="word")
lang.num.total <- lang.words %>%
group_by(language) %>%
summarise(n = n())
lang.vocab.comp <- lang.words %>%
group_by(language,lexical_category) %>%
summarise(num.per.cat = n())
lang.vocab.comp <- left_join(lang.vocab.comp, lang.num.total) %>%
mutate(prop.per.cat = num.per.cat/n)
return(lang.vocab.comp)
}
Get vocabulary composition data for all languages.
# get form compositions
lang.vocab.comp <- lang.vocab.composition(items) %>%
filter(lexical_category != "other")
# get data for kids in each language
vocab.comp.english <- vocab.composition(d.english,"English")
vocab.comp.spanish <- vocab.composition(d.spanish,"Spanish")
vocab.comp.norwegian <- vocab.composition(d.norwegian,"Norwegian")
vocab.comp.danish <- vocab.composition(d.danish,"Danish")
# aggregate data for all languages together
summary.vocab.comp <- rbind_list(vocab.comp.english,vocab.comp.spanish,
vocab.comp.norwegian,vocab.comp.danish) %>%
filter(age > 15 & age < 33) %>%
mutate(age.group = cut(age, breaks = c(15, 20, 24, 28, 32)),
language = factor(language,
levels=c("English", "Spanish",
"Norwegian", "Danish")),
lexical_category = factor(lexical_category,
levels = c("nouns", "predicates",
"function_words", "other"),
labels = c("Nouns", "Predicates",
"Function Words", "Other")))
Plot vocabulary composition by language.
ggplot(filter(summary.vocab.comp,lexical_category != "Other"),
aes(x=vocab.mean, y=prop, colour=lexical_category,
shape = lexical_category, fill = lexical_category,
label=lexical_category)) +
geom_point(size = 1, alpha = 0.25) +
facet_wrap(~ language) +
geom_hline(data=lang.vocab.comp,aes(yintercept=prop.per.cat),
linetype="dashed", color="grey") + #baselines for each language
geom_smooth(aes(group=lexical_category), method='loess', span=0.5) +
scale_y_continuous(name = "Proportion of total vocabulary") +
scale_x_continuous(name = "Vocabulary Size") +
geom_dl(aes(label=lexical_category), method=list("smart.grid")) +
theme_bw(base_size=14) +
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1")+
theme(axis.text.x = element_text(angle=-40, hjust = 0),
axis.title.y = element_text(vjust=0.35),
axis.title.x = element_text(vjust=-0.5),
legend.position="none")
Plot vocabulary composition by language and age group
ggplot(filter(summary.vocab.comp,lexical_category != "Other"),
aes(x=vocab.mean, y=prop, colour=lexical_category,
shape = lexical_category, fill = lexical_category,
label=lexical_category)) +
geom_jitter(size = 1, alpha = 0.5) +
facet_grid(language ~ age.group) +
geom_hline(data=lang.vocab.comp,aes(yintercept=prop.per.cat),
linetype="dashed", color="grey") + #baselines for each language
geom_smooth(aes(group=lexical_category), method='loess', span=0.5) +
scale_y_continuous(name = "Proportion of total vocabulary") +
scale_x_continuous(name = "Vocabulary Size") +
geom_dl(aes(label=lexical_category), method=list("smart.grid")) +
theme_bw(base_size=14) +
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1")+
theme(axis.text.x = element_text(angle=-40, hjust = 0),
axis.title.y = element_text(vjust=0.35),
axis.title.x = element_text(vjust=-0.5),
legend.position="none")
Plot vocabulary composition, now by lexical category.
lang.vocab.comp$lexical_category <- factor(lang.vocab.comp$lexical_category,
levels=c("function_words","nouns",
"predicates","other"),
labels=c("Function Words","Nouns",
"Predicates","Other"))
quartz()
ggplot(filter(summary.vocab.comp,lexical_category != "Other"),
aes(x=vocab.mean, y=prop, colour = age.group,
fill = age.group)) +
geom_jitter(size = 1, alpha = 0.5) +
facet_grid(language ~ lexical_category) +
geom_hline(data=lang.vocab.comp,
aes(yintercept=prop.per.cat),
linetype="dashed", color="grey") + #baselines for each language
geom_smooth(aes(group=age.group), method='loess', span=0.5) +
scale_y_continuous(name = "Proportion of total vocabulary") +
scale_x_continuous(name = "Vocabulary Size") +
theme_bw(base_size=14) +
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1")+
theme(axis.text.x = element_text(angle=-40, hjust = 0),
axis.title.y = element_text(vjust=0.35),
axis.title.x = element_text(vjust=-0.5),
legend.position="none")
(Old stuff that’s being kept around for possible future use.)
# Fit regressions to data
# t.lm1 <- lm(score ~ age + measure, data=filter(ms,language=="English"))
# t.lm2 <- lm(score ~ I(vocab.sum^2)*measure + age*measure, data=filter(ms,language=="English"))
# t.lm3 <- lm(score ~ I(vocab.sum^2)*measure*age.binned, data=filter(ms,language=="English"))
# t.lm4 <- lm(score ~ I(vocab.sum^2)*measure*age, data=filter(ms,language=="English"))
#
# ms$predicted <- predict.lm(t.lm3,ms)
#
# Plot by age
# ggplot(ms,aes(x = vocab, y = score, colour = measure,label=measure))+
# facet_wrap(~ age)+
# geom_jitter(size=1)+
# geom_line(aes(y=predicted),size=.5)+
# scale_color_brewer(palette="Set1") +
# scale_x_continuous(limits = c(0,681), breaks = seq(0,680,100),name = "Vocabulary (WS)") +
# scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),"Score (Mean Items)") +
# theme_bw(base_size = 14)
{r,fig.width=12,fig.height=7.5} #ggplot(ms,aes(x = vocab, y = score, colour = age.binned, fill = age.binned, # label = age.binned)) + # geom_jitter(size=1.5)+ # geom_line(aes(y=predicted),size=1) + # facet_wrap(~measure) + # scale_x_continuous(limits = c(0,681), breaks = seq(0,680,100),name = "Vocabulary (WS)") + # scale_y_continuous(limits = c(0, 1.05), breaks = seq(0,1,.2),"Score (Mean Items)") + # theme_bw(base_size = 14) + # scale_color_brewer(palette="Set1") + # scale_fill_brewer(palette="Set1") #